Task 1

# load packages
library(tidyverse)
library(RColorBrewer)
library(lubridate)
library(gganimate) 
library(ggimage)

# Import the raw CSV files: launches feeds Task 1, usda feeds Task 2.
# Both paths are relative to the current working directory.
launches <- read_csv(file = "launches.csv")
usda <- read_csv(file = "usda_nutrients.csv")
# Yearly launch counts per country, used as the base table for the graph.
# state_code is recoded into readable country names (RU and SU are merged
# into one "Russia/USSR" level); the 4 most frequent levels are then kept
# and every remaining code is pooled into "Other".
# Result columns: country, launch_year, n (launches in that year).

launch_1 <- launches %>%
  mutate(
    country = fct_lump(
      fct_collapse(
        state_code,
        "Russia/USSR" = c("RU", "SU"),
        "United States" = "US",
        "China" = "CN",
        "France" = "F"
      ),
      n = 4
    )
  ) %>%
  group_by(country, launch_year) %>%
  count()

# Step towards the success rate per country:
# tabulate how many launches fall into each outcome category for every
# country (same country recoding and top-4 lumping as used for the graph).

launch_2 <- launches %>%
  select(state_code, category) %>%
  mutate(
    country = fct_lump(
      fct_collapse(
        state_code,
        "Russia/USSR" = c("RU", "SU"),
        "United States" = "US",
        "China" = "CN",
        "France" = "F"
      ),
      n = 4
    )
  ) %>%
  group_by(country, category) %>%
  count()

# Share of successful launches per country.
#
# Both the total number of launches (n) and the number of successful launches
# (number_success) are computed from `launches` in a single summary, so each
# count is tied to its country by name rather than by row position. This
# replaces a hand-typed vector of counts that was only correct as long as the
# row order and the underlying data never changed.
#
# NOTE(review): assumes category == "O" marks a successful launch (and any
# other value a failure) -- confirm against the data dictionary. With that
# coding the counts should reproduce the values previously read off launch_2:
# China 289, France 284, Russia/USSR 3024, United States 1585, Other 202.

launch_success <- launches %>%
  mutate(country = fct_collapse(state_code,
                                "Russia/USSR" = c("RU", "SU"),
                                "United States" = "US",
                                "China" = "CN",
                                "France" = "F"),
         country = fct_lump(country, n = 4)) %>%
  group_by(country) %>%
  summarise(
    n = n(),
    # rows with a missing category are not counted as successes
    number_success = sum(category == "O", na.rm = TRUE)
  )

# Final lookup table: one row per country with its success share.
# `percent` is stored as a proportion between 0 and 1, not as 0-100.

success <- launch_success %>%
  mutate(percent = number_success / n) %>%
  select(country, percent)

# Attach each country's success share to its yearly launch counts.
# A left join keeps every row of launch_1; rows are matched on country.

launch_data <- launch_1 %>%
  left_join(success, by = "country")
# Path to the rocket icon that geom_image() draws at every data point
# (only the file path is stored here; ggimage loads the file when rendering).

rocket <- "rocket.png"

# Animated graph of launches per year for each country.
# - one line per country (group aesthetic), colored by the country's overall
#   share of successful launches
# - a rocket icon and the country name follow the line as it is revealed

launch_graph <- ggplot(
  launch_data,
  aes(x = launch_year, y = n, group = country, color = percent)
) +
  geom_line() +
  geom_image(aes(image = rocket)) +
  # color is inherited from the plot-level aesthetics; the label is nudged to
  # the right so it does not sit on top of the rocket icon
  geom_text(aes(label = country),
            position = position_nudge(y = 0.04, x = 9),
            size = 5) +
  scale_y_continuous(expand = c(0, 0), limits = c(0, 120),
                     breaks = seq(0, 120, by = 15)) +
  scale_x_continuous(expand = c(0, 0), limits = c(1955, 2020),
                     breaks = seq(1955, 2020, by = 5)) +
  labs(x = "Launch Year", y = "Number of Launches") +
  # `percent` holds proportions (0-1); format the legend labels as percentages
  # so they agree with the legend title
  scale_color_distiller(palette = "YlGnBu", direction = 1,
                        name = "Percent of Successful\nLaunches by Country",
                        labels = function(x) sprintf("%.1f%%", 100 * x)) +
  theme(panel.background = element_rect(fill = "grey86"),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        panel.border = element_rect(color = "black", fill = NA)) +
  # Reveal the data along launch_year. In released gganimate the first
  # argument is `along` and series are separated by the group aesthetic
  # (group = country above); the older (id, along) call form would pass
  # country as `along` and launch_year as `range`.
  transition_reveal(along = launch_year)

# Printing the object renders the animation
launch_graph

Figure 1: Top Five Countries in Space Launches Since 1955. The four countries with the largest number of space launches are shown; all other countries are aggregated into the category “Other”. Color indicates the proportion of successful launches out of the total launches per country in the time period from 1955 to 2018.

Task 2

# load packages for Task 2 to avoid conflicts in task 1

library(factoextra)
library(ggbiplot)
# Task 2 data: keep only raw fruit and vegetable items.
# A row is retained when its food_group is one of the two target groups AND
# its short_descrip contains the (case-sensitive) text "RAW".

nutrients <- usda %>%
  filter(
    food_group %in% c("Vegetables and Vegetable Products",
                      "Fruits and Fruit Juices"),
    str_detect(short_descrip, "RAW")
  )
# PCA for variables from Protein through Zinc (columns 9 to 30 of nutrients).
# scale. = TRUE standardizes every variable to unit variance before the PCA
# (centering is on by default); the argument is spelled out in full instead of
# relying on partial matching of `scale`.

nutrient_pca <- prcomp(nutrients[9:30], scale. = TRUE)

# Percent of total variance captured by each principal component. Used for the
# axis labels so they always match the fitted PCA instead of hand-typed values.
pca_var_pct <- 100 * nutrient_pca$sdev^2 / sum(nutrient_pca$sdev^2)

# Biplot: observations colored by food group, variable loadings drawn as
# labelled arrows (only variables are labelled, with repelled text).
fviz_pca_biplot(nutrient_pca,
                habillage = nutrients$food_group,
                palette = c("darkorange2", "darkgreen"),
                label = "var",
                repel = TRUE,
                col.var = "midnightblue") +
  labs(title = "PCA Biplot: Fruit and Vegetable Nutrients",
       x = sprintf("Dimension 1 (%.1f%%)", pca_var_pct[1]),
       y = sprintf("Dimension 2 (%.1f%%)", pca_var_pct[2])) +
  theme_bw()

Trends
From the biplot above, there is a negative correlation between sugar and protein, which makes sense because foods that are high in sugar are typically not high in protein. There is a positive correlation between copper and phosphorus and no correlation between magnesium and Vitamin B12. Fruits and fruit juices are more concentrated to the left of the biplot, indicating that, in general, fruits are higher in sugar. Vegetables and vegetable products, in contrast, are more spread out, with more points to the right of the biplot, indicating that vegetables generally have higher nutrient concentrations.